Preprocessing Dataset

Libraries

Library for read dataset.

library(readr)

Library for data frames processing.

library(dplyr)
library(tidyr)

Library for R Markdown.

library(rmarkdown)
library(knitr)

Library for data presentation.

library(scales)

Library for manage strings.

library(stringr)

Load dataset

Set the path of the dataframe file.

path_import = "../virusTotal/data/virusTotalOriginal.csv"
path_export = "../virusTotal/data/virusTotal.csv"

Load dataset.

df <- read_csv(path_import)

Statistics

Dimensions.

dim(df)
## [1] 183 447

Types

View witch types are in the dataset.

col_types_all <- 
  df %>% 
  sapply(typeof) %>% 
  unlist()

col_types_table <- 
  col_types_all %>% 
  table()

col_types <- 
  col_types_table %>% 
  as.vector()

names(col_types) <- names(col_types_table)
## character    double   logical 
##       158       204        85

As can be seen there are the three expected types: character, double and logical.

NA

Percentaje of NA values

Define function to see the amount of NA values in the dataframe.

percent_of_NA <- 
  function(df){
    num_of_NA <- 
      df %>% is.na() %>% sum()
    num_of_values <- 
      df %>% dim() %>% prod()
    percent_of_NA <- 
      (num_of_NA / num_of_values) %>% 
      percent()
    return(percent_of_NA)
  }
percent_of_NA(df)
## [1] "37%"

Columns with NA

Define functions to see the NA in columns.

num_of_NA_by_column <- 
  function(df){
    df %>% is.na() %>% colSums()
  }
remove_0 <- 
  function(x) x[x!=0]
names_of_colums_with_NA <- 
  function(df)
    df %>% 
      num_of_NA_by_column() %>% 
      remove_0 %>% 
      names()
percentaje_of_cols_with_NA <-
  function(df)
    (length(names_of_colums_with_NA(df)) / ncol(df)) %>% 
    percent()

Compute the percentaje of cols with NA.

percentaje_of_cols_with_NA(df)
## [1] "74%"

Inspect if there are columns full of NA.

is_full_of_NA <- function(col){
  num_of_NA <- 
    col %>% 
    is.na() %>% 
    sum()
  return(num_of_NA == length(col))
}
cols_full_of_NA <- 
  df %>% 
  select_if(is_full_of_NA) %>% 
  names()
##  [1] "authentihash"                  "scans.Bkav.result"            
##  [3] "scans.CMC.result"              "scans.ALYac.result"           
##  [5] "scans.Malwarebytes.result"     "scans.K7AntiVirus.result"     
##  [7] "scans.Baidu.result"            "scans.SUPERAntiSpyware.result"
##  [9] "scans.Gridinsoft.result"       "scans.ViRobot.result"         
## [11] "scans.BitDefenderTheta.result" "scans.TACHYON.result"         
## [13] "scans.VBA32.result"            "scans.Zoner.result"           
## [15] "scans.Panda.result"            "scans.Elastic.result"         
## [17] "scans.Cylance.result"          "scans.SentinelOne.result"

As can be seen there are many columns that are full of NA, so can be deleted.

df <- 
  select(df, -all_of(cols_full_of_NA))

Colums with the same value

Maybe there are columns that has the same value along all the vector, so are useless.

Define function to remove these columns.

different_values <- 
  function(x)
    x %>% na.omit() %>% unique() %>% length()
remove_columns_with_the_same_value <- 
  function(df)
      select_if(df, function(col) different_values(col) > 1)

Apply function.

num_of_cols_after_remove <- 
  df %>% 
  remove_columns_with_the_same_value() %>% 
  ncol()

Calculate the number of columns with same value.

ncol(df) - num_of_cols_after_remove
## [1] 147

Awesom! Many colums found. Let’s remove them.

df <- 
  remove_columns_with_the_same_value(df)

Inspecting dataframe

Now let’s deeply inspect into the dataframe.

View dataframe

View dataframe.

Renaming

The column “…1” is the row number, so “n” will be a better name. The “…JSON” it’s a bad name, just “json” is fine.

df <- 
  df %>% 
  rename(n = ...1, json=..JSON)

Removing cols

There are many duplicated cols, hashes & dates that can be removed, also many useless.

Dates

There are many dates in the dataset, that are not relevant for virus analysis. So let’s remove them.

Define a function for check if a col is of type Date.

not <- 
  function(x) !x

get_element <- 
  function(x, index) x[index]

is_date_col <- 
  function(col, pattern="^[:digit:]{4}[-:/][:digit:]{2}[-:/][:digit:]{2}")
    col %>% 
    as.character() %>% 
    na.omit() %>% 
    get_element(1) %>% 
    str_detect(pattern)

Columns detected.

df %>% 
  select_if(is_date_col) %>% 
  head() %>% 
  paged_table()

Define function for remove cols by a predicate.

remove_col_if <- 
  function(df, fun){
    cols_to_delete <- 
      df %>% 
      select_if(fun) %>% 
      colnames()
    df <- 
      df %>% 
      select(-cols_to_delete)
    return(df)
  }

Remove them.

df <- 
  remove_col_if(df, is_date_col)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cols_to_delete)` instead of `cols_to_delete` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

Hashes

There are many hashes cols that don’t really provide useful information. So remove them.

hashes <- 
  c("")

df <- 
  df %>% 
  select(
    -vhash,
    -sha256,
    -sha1,
    -scan_id,
    -ssdeep,
    -md5,
    -additional_info.androguard.certificate.serialnumber,
    -additional_info.androguard.certificate.thumbprint,
    -additional_info.exiftool.ZipCRC
  )

Scans

There are many scans of different antivirus, that has very similar information. Just keep the scan with less NA values.

Get the best col.

scans_col_names <- 
  df %>% 
  colnames() %>% 
  str_match_all("scans.[:alpha:]*.result") %>% 
  unlist()

scan_na_by_col <- 
  df %>% 
  select(all_of(scans_col_names)) %>%
  num_of_NA_by_column()

scan_col_witch_min_na <- 
  scan_na_by_col %>% 
  which.min()

best_scanner_colname <- 
  scan_na_by_col %>% 
  names() %>% 
  get_element(scan_col_witch_min_na)

best_scanner_col <- 
  df %>% 
  select(all_of(best_scanner_colname))

best_scanner_name <- 
  best_scanner_colname %>% 
  str_split("[.]") %>% 
  unlist() %>% 
  get_element(2)
## [1] "Best scan col: scans.Fortinet.result"
## [1] "Best scanner: Fortinet"

Drop all scans but the best.

col_index_scanners <- 
  df %>% 
  colnames() %>% 
  str_detect("scans") %>%
  unlist() %>% 
  which()

df <- 
  df %>% 
  select(-all_of(col_index_scanners)) %>% 
  cbind(best_scanner_col)

Individual columns

Reasons:

  • json column contains all the row as JSON.
  • permalink is the URL where Virus Total has the virus file.
  • Main.Activity & Package are strings with all different values.
  • FileTypeExtension, ZipFileName & MIMEType has same values as FileType.
  • ZipBitFlag doesn’t seems to matter.
  • additional_info.magic has the vesion of the ZIP file, that doesn’t seems to matter.
  • Subject.DN is the JSON fragment that has all the information about the subject, but these data are decomposed in the rest of Subject cols.
df <- 
  df %>% 
  select(
    -json,
    -permalink,
    -additional_info.androguard.AndroidApplicationInfo,
    -additional_info.androguard.Main.Activity,
    -additional_info.exiftool.MIMEType,
    -additional_info.exiftool.FileTypeExtension,
    -additional_info.exiftool.ZipFileName,
    -additional_info.magic,
    -additional_info.androguard.Package,
    -additional_info.androguard.certificate.Subject.DN
  )

Groups of columns

Define a function to remove cols which name match a pattern.

remove_cols_which_name_match <- 
  function(df, pattern){
    cols_to_remove <- 
      df %>% 
      colnames() %>% 
      str_which(pattern)
    df_removed_cols <- 
      df %>% 
      select(-all_of(cols_to_remove))
    return(df_removed_cols)
  }

Remove groups.

Reasons:

  • Issuer group has the same information as Subject group.
  • CompressedView and RiskIndicator.APK groups have the same information as file_type group
df <- 
  df %>% 
  remove_cols_which_name_match("^additional_info.androguard.certificate.Issuer.[:alpha:]*$") %>% 
  remove_cols_which_name_match("^additional_info.compressedview.extensions.[:alpha:]*$") %>% 
  remove_cols_which_name_match("^additional_info.androguard.RiskIndicator.APK.[:alpha:]*$")

View results

Define a function for sort columns.

sort_cols <- 
  function(df){
    df <- 
      df %>% 
      select(order(colnames(df)))
    additionalInfo_cols_logical <- 
      df %>% 
      colnames() %>% 
      str_detect("additional_info")
    additionalInfo_cols <- 
      df %>% 
      select(which(additionalInfo_cols_logical))
    not_additionalInfo_cols <- 
      df %>% 
      select(which(!additionalInfo_cols_logical)) %>% 
      select(n, size, everything())
    return(cbind(not_additionalInfo_cols, additionalInfo_cols))
  }

Sort columns.

df <- 
  sort_cols(df)

View results.

Replacing values

Replace “Unknown” and “?” by NA

There are some columns that hast the value “Unknown” or “?” instead of NAs. So let’s replace them.

Define a function to replace values in cols that satisfy a predicate.

replace_when <- 
  function(df, fun, value, replacement){
    cols_to_replace <- 
      df %>% 
      select_if(fun) %>% 
      colnames()
    df_replaced_cols <- 
      df %>% 
      select(all_of(cols_to_replace)) %>% 
      sapply(function(col) replace(col, which(col==value), replacement))
    df_without_replaced_cols <- 
      df %>% 
      select(-all_of(cols_to_replace))
    return(cbind(df_without_replaced_cols, df_replaced_cols))
  }

Replace ? and Unknown for NAs.

df <- 
  df %>% 
  replace_when(function(col) any(str_detect(col, fixed("Unknown"))), "Unknown", NA) %>% 
  replace_when(function(col) any(str_detect(col, fixed("?"))), "?", NA)

Replace NA for 0

Permissions colums

The permissions columns (PERM) seems that there are NAs where there should be 0s. So it would be better to replace them.

replace_na_which_colname_match <- 
  function(df, pattern, replacement){
    cols_to_replace <- 
      df %>% 
      colnames() %>% 
      str_which(pattern)
    df_replaced_cols <- 
      df %>% 
      select(all_of(cols_to_replace)) %>% 
      sapply(function(col) replace_na(col, replacement))
    df_without_replaced_cols <- 
      df %>% 
      select(-all_of(cols_to_replace))
    return(cbind(df_without_replaced_cols, df_replaced_cols))
  }
replace_na_when <- 
  function(df, fun, replacement){
    cols_to_replace <- 
      df %>% 
      select_if(fun) %>% 
      colnames()
    df_replaced_cols <- 
      df %>% 
      select(all_of(cols_to_replace)) %>% 
      sapply(function(col) replace_na(col, replacement))
    df_without_replaced_cols <- 
      df %>% 
      select(-all_of(cols_to_replace))
    return(cbind(df_without_replaced_cols, df_replaced_cols))
  }
pattern <- "additional_info.androguard.RiskIndicator.PERM"

df <- 
  df %>% 
  replace_na_which_colname_match(pattern, 0)

df %>% 
  select(., str_which(colnames(.), pattern)) %>% 
  paged_table()

New colums

There are columns that must be only one, others provide more information by operating two columns, summarises information.

New total permissions column

Create a new column that sums all permissions of permissions columns.

pattern <- "additional_info.androguard.RiskIndicator.PERM"

df_without_permissions <- 
  df %>% 
  select(., -(str_which(colnames(.), pattern)))

df_permissions <- 
  df %>% 
  select(., str_which(colnames(.), pattern)) %>% 
  mutate(., total_PERMs = rowSums(.))

df <- cbind(df_without_permissions, df_permissions)

Merge both MP3 columns into one

There are two mp3 extension columns, one for .MP3 and the other for .mp3, both are mp3 files. The others columns has the name of the extension in upper case, so let’s sum both into the MP3 column.

mp3_cols_logical <- 
  df %>% 
  colnames() %>% 
  tolower() %>% 
  str_detect("mp3")
mp3_upper_colname <- 
  colnames(df)[which(mp3_cols_logical)] %>% 
  str_match("^.*MP3$")
sprintf("MP3 colname: %s", mp3_upper_colname)
## [1] "MP3 colname: NA"                                           
## [2] "MP3 colname: additional_info.compressedview.file_types.MP3"

Sum them into NA, additional_info.compressedview.file_types.MP3

mp3_col <- 
  df %>% 
  select(which(mp3_cols_logical)) %>% 
  rowSums(na.rm = TRUE)

df <- 
  df %>% 
  select(-which(mp3_cols_logical)) %>% 
  mutate(additional_info.compressedview.file_types.MP3 = mp3_col)

View results

Sort columns.

df <- 
  sort_cols(df)

View results.

Save dataframe

After all preprocessing let’s save it into CSV.

write.csv(df, path_export)

Functions for preprocessing

As factor

labels <- 
  function(n){
    if(n == 5){
      return(c("very low", "low", "medium", "high", "very high"))
    }else if(n == 4){
      return(c("very low", "low", "high", "very high"))
    }else if(n == 3){
      return(c("low", "medium", "high"))
    }else if(n == 2){
      return(c("low", "high"))
    }else{
      stop("Not avalible")
    }
  }

cut_by_quantiles <- 
  function(col){
    quantiles <- 
      col %>% 
      quantile(na.rm = TRUE) %>% 
      unique()
    if(length(quantiles) > 2){
      col <- 
        col %>% 
        cut(breaks = quantiles, 
            labels = labels(length(quantiles)-1),
            include.lowest = TRUE)
    }
    return(col)
  }

df_cut_by_quantiles <- 
  function(df){
    df_without_numeric <- 
      df[sapply(df, function(col) !is.numeric(col))]
    df_numeric <- 
      df %>% 
      select_if(is.numeric)
    df_numeric <- 
      df_numeric %>% 
      lapply(cut_by_quantiles)
    return(cbind(df_without_numeric, df_numeric))
  }